{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "6fdba594",
   "metadata": {},
   "source": [
    "# Query PDF Tutorial"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "05999c63",
   "metadata": {},
   "source": [
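    "In this tutorial, we demonstrate how to load a PDF into EvaDB, query its paragraphs with SQL, and run Hugging Face text-classification and summarization models on the extracted text."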
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "40100c9c",
   "metadata": {},
   "source": [
    "<table align=\"left\">\n",
    "  <td>\n",
    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/georgia-tech-db/eva/blob/master/tutorials/12-query-pdf.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /> Run on Google Colab</a>\n",
    "  </td>\n",
    "  <td>\n",
    "    <a target=\"_blank\" href=\"https://github.com/georgia-tech-db/eva/blob/master/tutorials/12-query-pdf.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /> View source on GitHub</a>\n",
    "  </td>\n",
    "  <td>\n",
    "    <a target=\"_blank\" href=\"https://github.com/georgia-tech-db/eva/raw/master/tutorials/12-query-pdf.ipynb\"><img src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /> Download notebook</a>\n",
    "  </td>\n",
    "</table><br><br>"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "d53ef889",
   "metadata": {},
   "source": [
    "### Connect to EvaDB\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cc1741d4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install --quiet \"evadb[document,notebook]\"\n",
    "\n",
    "import warnings\n",
    "\n",
    "import evadb\n",
    "\n",
    "# Install the warning filter before calling into EvaDB so that warnings raised\n",
    "# while connecting (and by later queries) do not clutter the tutorial output.\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "# Connect to EvaDB and keep a cursor; every query below goes through it.\n",
    "cursor = evadb.connect().cursor()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "cfda671a",
   "metadata": {},
   "source": [
    "### Download PDFs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1ee8b17b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File ‘pdf_sample1.pdf’ already there; not retrieving.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# --no-clobber skips the download when the file is already present locally.\n",
    "!wget --no-clobber \"https://www.dropbox.com/s/fv6pqdneth3l6fz/pdf_sample1.pdf\""
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "1d9c44d4",
   "metadata": {},
   "source": [
    "### Load PDFs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "98061621",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Number of loaded PDF: 1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         0\n",
       "0  Number of loaded PDF: 1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop any table left over from a previous run so this cell can be re-executed.\n",
    "drop_table_query = \"DROP TABLE IF EXISTS MyPDFs\"\n",
    "cursor.query(drop_table_query).df()\n",
    "\n",
    "# Load the downloaded PDF into the MyPDFs table.\n",
    "load_pdf_query = \"LOAD PDF 'pdf_sample1.pdf' INTO MyPDFs\"\n",
    "cursor.query(load_pdf_query).df()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "972e2b21",
   "metadata": {},
   "source": [
    "### Retrieve Text from Loaded PDFs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f2674110",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mypdfs._row_id</th>\n",
       "      <th>mypdfs.name</th>\n",
       "      <th>mypdfs.page</th>\n",
       "      <th>mypdfs.paragraph</th>\n",
       "      <th>mypdfs.data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>HAEMETOLOGY       STUDY OF BLOOD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>DEFINATION  Specialized connective tissue wit...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>PHYSICAL CHARACTERISTICS ( 1 )  COLOUR   -- R...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>PLASMA         SERUM</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>[1] has fibrinogen [1] No fibrinogen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>[2] has prothrombin [2] No  prothrombin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>[3]  has clotting factors            V    and ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "      <td>[3] no  factors V   &amp;   VIII</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "      <td>[4]    No  platelet derived            growth ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>2</td>\n",
       "      <td>9</td>\n",
       "      <td>[4] Has additional platelet growth factors  th...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>2 methods                 [1] ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>2</td>\n",
       "      <td>12</td>\n",
       "      <td>SEPARATION OF CELLS AND FLUID  :-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>COMPOSITION OF BLOOD</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>CELLS 40%  TO    45%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>↓--------------------------------------------↓</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>ERYTHROCYTES – RBC Adult male  5.2 million/c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>THROMBOCYTES – PLATELETS Adults ( M &amp; F )    ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>PLASMA  55%   TO    60%  ↓---------------...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>3</td>\n",
       "      <td>8</td>\n",
       "      <td>WATER  91%  TO    92%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>3</td>\n",
       "      <td>9</td>\n",
       "      <td>SOLIDS  8%  TO 9%</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>3</td>\n",
       "      <td>10</td>\n",
       "      <td>A) INORGANIC :-  0.9%      mainly :- Na⁺, Cl¯,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>3</td>\n",
       "      <td>11</td>\n",
       "      <td>d) Carbohydrates:-  Glucose  -----   e) Enzyme...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>FUNCTIONS OF BLOOD </td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>[ I ]  TRANSPORT           (a)  Gases       ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>[iii] Body temperature  by the specific...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>[ II ]   MAINTENANCE </td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>[i] Water balance   between cells and plasma ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>pressure  with...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>[ III ]  STORES of   Glucose,  aminoacids,  e...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>6</td>\n",
       "      <td>2</td>\n",
       "      <td>[ IV ] DEFENCE    by properties of Leucocytes...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>[ V ] PREVENTION OF BLOOD LOSS  by  forming a...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    mypdfs._row_id      mypdfs.name  mypdfs.page  mypdfs.paragraph  \\\n",
       "0                1  pdf_sample1.pdf            1                 1   \n",
       "1                1  pdf_sample1.pdf            1                 2   \n",
       "2                1  pdf_sample1.pdf            1                 3   \n",
       "3                1  pdf_sample1.pdf            2                 3   \n",
       "4                1  pdf_sample1.pdf            2                 4   \n",
       "5                1  pdf_sample1.pdf            2                 5   \n",
       "6                1  pdf_sample1.pdf            2                 6   \n",
       "7                1  pdf_sample1.pdf            2                 7   \n",
       "8                1  pdf_sample1.pdf            2                 8   \n",
       "9                1  pdf_sample1.pdf            2                 9   \n",
       "10               1  pdf_sample1.pdf            2                11   \n",
       "11               1  pdf_sample1.pdf            2                12   \n",
       "12               1  pdf_sample1.pdf            3                 1   \n",
       "13               1  pdf_sample1.pdf            3                 2   \n",
       "14               1  pdf_sample1.pdf            3                 3   \n",
       "15               1  pdf_sample1.pdf            3                 4   \n",
       "16               1  pdf_sample1.pdf            3                 5   \n",
       "17               1  pdf_sample1.pdf            3                 6   \n",
       "18               1  pdf_sample1.pdf            3                 7   \n",
       "19               1  pdf_sample1.pdf            3                 8   \n",
       "20               1  pdf_sample1.pdf            3                 9   \n",
       "21               1  pdf_sample1.pdf            3                10   \n",
       "22               1  pdf_sample1.pdf            3                11   \n",
       "23               1  pdf_sample1.pdf            5                 1   \n",
       "24               1  pdf_sample1.pdf            5                 2   \n",
       "25               1  pdf_sample1.pdf            5                 3   \n",
       "26               1  pdf_sample1.pdf            5                 4   \n",
       "27               1  pdf_sample1.pdf            5                 5   \n",
       "28               1  pdf_sample1.pdf            5                 6   \n",
       "29               1  pdf_sample1.pdf            6                 1   \n",
       "30               1  pdf_sample1.pdf            6                 2   \n",
       "31               1  pdf_sample1.pdf            6                 3   \n",
       "\n",
       "                                          mypdfs.data  \n",
       "0                  HAEMETOLOGY       STUDY OF BLOOD   \n",
       "1   DEFINATION  Specialized connective tissue wit...  \n",
       "2   PHYSICAL CHARACTERISTICS ( 1 )  COLOUR   -- R...  \n",
       "3                               PLASMA         SERUM   \n",
       "4               [1] has fibrinogen [1] No fibrinogen   \n",
       "5            [2] has prothrombin [2] No  prothrombin   \n",
       "6   [3]  has clotting factors            V    and ...  \n",
       "7                      [3] no  factors V   &   VIII    \n",
       "8   [4]    No  platelet derived            growth ...  \n",
       "9   [4] Has additional platelet growth factors  th...  \n",
       "10                 2 methods                 [1] ...  \n",
       "11                SEPARATION OF CELLS AND FLUID  :-    \n",
       "12                            COMPOSITION OF BLOOD     \n",
       "13                              CELLS 40%  TO    45%   \n",
       "14    ↓--------------------------------------------↓   \n",
       "15                                                     \n",
       "16  ERYTHROCYTES – RBC Adult male  5.2 million/c...  \n",
       "17  THROMBOCYTES – PLATELETS Adults ( M & F )    ...  \n",
       "18       PLASMA  55%   TO    60%  ↓---------------...  \n",
       "19                             WATER  91%  TO    92%   \n",
       "20                                SOLIDS  8%  TO 9%    \n",
       "21  A) INORGANIC :-  0.9%      mainly :- Na⁺, Cl¯,...  \n",
       "22  d) Carbohydrates:-  Glucose  -----   e) Enzyme...  \n",
       "23                               FUNCTIONS OF BLOOD   \n",
       "24  [ I ]  TRANSPORT           (a)  Gases       ...  \n",
       "25        [iii] Body temperature  by the specific...  \n",
       "26                             [ II ]   MAINTENANCE   \n",
       "27  [i] Water balance   between cells and plasma ...  \n",
       "28                                  pressure  with...  \n",
       "29  [ III ]  STORES of   Glucose,  aminoacids,  e...  \n",
       "30  [ IV ] DEFENCE    by properties of Leucocytes...  \n",
       "31  [ V ] PREVENTION OF BLOOD LOSS  by  forming a...  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show every row of the table; the columns include name, page, paragraph and data.\n",
    "select_all_query = \"SELECT * FROM MyPDFs\"\n",
    "cursor.query(select_all_query).df()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b476274d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mypdfs._row_id</th>\n",
       "      <th>mypdfs.name</th>\n",
       "      <th>mypdfs.page</th>\n",
       "      <th>mypdfs.paragraph</th>\n",
       "      <th>mypdfs.data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>pdf_sample1.pdf</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>PHYSICAL CHARACTERISTICS ( 1 )  COLOUR   -- R...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   mypdfs._row_id      mypdfs.name  mypdfs.page  mypdfs.paragraph  \\\n",
       "0               1  pdf_sample1.pdf            1                 3   \n",
       "\n",
       "                                         mypdfs.data  \n",
       "0  PHYSICAL CHARACTERISTICS ( 1 )  COLOUR   -- R...  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Narrow the result to one paragraph using the page and paragraph columns.\n",
    "single_paragraph_query = \"\"\"\n",
    "    SELECT *\n",
    "    FROM MyPDFs\n",
    "    WHERE page = 1 AND paragraph = 3\n",
    "\"\"\"\n",
    "cursor.query(single_paragraph_query).df()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "f502eddb",
   "metadata": {},
   "source": [
    "### Create Functions for Text Classification and Text Summarization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d2c46f7e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Function TextClassifier already exists, nothin...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0\n",
       "0  Function TextClassifier already exists, nothin..."
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Register a Hugging Face text-classification model as an EvaDB function.\n",
    "# IF NOT EXISTS makes the cell safe to re-run.\n",
    "create_classifier_query = \"\"\"\n",
    "    CREATE FUNCTION IF NOT EXISTS TextClassifier\n",
    "    TYPE HuggingFace\n",
    "    TASK 'text-classification'\n",
    "    MODEL 'distilbert-base-uncased-finetuned-sst-2-english'\n",
    "\"\"\"\n",
    "cursor.query(create_classifier_query).df()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fd75bf79",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Function TextSummarizer already exists, nothin...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0\n",
       "0  Function TextSummarizer already exists, nothin..."
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Register a Hugging Face summarization model as an EvaDB function.\n",
    "# IF NOT EXISTS makes the cell safe to re-run.\n",
    "create_summarizer_query = \"\"\"\n",
    "    CREATE FUNCTION IF NOT EXISTS TextSummarizer\n",
    "    TYPE HuggingFace\n",
    "    TASK 'summarization'\n",
    "    MODEL 'facebook/bart-large-cnn'\n",
    "\"\"\"\n",
    "cursor.query(create_summarizer_query).df()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "33b97175",
   "metadata": {},
   "source": [
    "### Get Summaries of a Subset of Paragraphs with Negative Sentiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42ddaccb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Your max_length is set to 142, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)\n"
     ]
    }
   ],
   "source": [
    "# Summarize paragraphs 1-3 of page 1, keeping only those that the\n",
    "# classifier labels as NEGATIVE.\n",
    "negative_summary_query = \"\"\"\n",
    "    SELECT data, TextSummarizer(data)\n",
    "    FROM MyPDFs\n",
    "    WHERE page = 1 AND paragraph >= 1 AND paragraph <= 3 AND TextClassifier(data).label = 'NEGATIVE'\n",
    "\"\"\"\n",
    "cursor.query(negative_summary_query).df()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  },
  "vscode": {
   "interpreter": {
    "hash": "292337e8e9747092192f4a1ef18b0951099c869b0f06eb7241460e1768f24923"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
